# Global knitr chunk options: suppress messages and warnings in the rendered
# output, evaluate every chunk, and do not cache chunk results.
knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  eval = TRUE,
  cache = FALSE
)

Intro

For today’s workshop, we’re going to use R to go through a typical bioinformatics analysis workflow. We’re going to use common bioinformatics techniques to visualize data and make beautiful figures.

The data we will analyze is breast cancer RNA-Seq data from TCGA, a popular publicly-available database for cancer-related datasets.

Make sure to have the following packages installed for this workshop:

Working with Expression Set Objects

An expression set is a data object consisting of three entities: the expression matrix (exprs), the phenotype data (pData), and the feature data (fData).

We read in the RDS file included in this repo. It corresponds to a subset of samples from a gene expression dataset of breast cancer (BRCA) primary tissue samples from the TCGA project.

#packages we will need for this workshop
library(Biobase)
library(magrittr)
library(dplyr)
library(ggplot2)
library(ggfortify)
library(plotly)
library(pheatmap)
library(ComplexHeatmap)
library(RColorBrewer)
# Load the serialized ExpressionSet shipped with the repository.
brca <- readRDS("data/TCGA-BRCA.rds")

# Size of the expression data (features x samples)
dim(brca)
## Features  Samples 
##    36812     1222
# Size of the gene (feature) annotation table
fData(brca) %>%
  dim()
## [1] 36812     4
# Peek at the identifier columns of the gene annotation
fData(brca)[, c("ensembl_transcript_id", "ensembl_gene_id", "hgnc_symbol")] %>%
  head()
##          ensembl_transcript_id ensembl_gene_id hgnc_symbol
## TSPAN6      ENSG00000000003.13 ENSG00000000003      TSPAN6
## TNMD         ENSG00000000005.5 ENSG00000000005        TNMD
## DPM1        ENSG00000000419.11 ENSG00000000419        DPM1
## SCYL3       ENSG00000000457.12 ENSG00000000457       SCYL3
## C1orf112    ENSG00000000460.15 ENSG00000000460    C1orf112
## FGR         ENSG00000000938.11 ENSG00000000938         FGR
# Size of the phenotype (sample) annotation table
pData(brca) %>%
  dim()
## [1] 1222   65
# Peek at a few key phenotype columns
pData(brca)[, c("patient_id", "sample_type", "tumor_subtype")] %>%
  head()
##                                patient_id   sample_type tumor_subtype
## TCGA-A8-A085-01A-11R-A00Z-07 TCGA-A8-A085 Primary Tumor          LumB
## TCGA-A2-A0SY-01A-31R-A084-07 TCGA-A2-A0SY Primary Tumor          LumA
## TCGA-AR-A24Z-01A-11R-A169-07 TCGA-AR-A24Z Primary Tumor          LumB
## TCGA-D8-A1XU-01A-11R-A14M-07 TCGA-D8-A1XU Primary Tumor          LumA
## TCGA-A1-A0SN-01A-11R-A144-07 TCGA-A1-A0SN Primary Tumor          Her2
## TCGA-D8-A73W-01A-22R-A352-07 TCGA-D8-A73W Primary Tumor          LumB
# Sample counts per sample type (`$` on the ExpressionSet reads a pData column)
table(brca$sample_type)
## 
##          Metastatic       Primary Tumor Solid Tissue Normal 
##                   7                1102                 113
# Sample counts per tumor subtype
table(brca$tumor_subtype)
## 
##  Basal   Her2   LumA   LumB Normal 
##    169    209    510    198     16

Log transform the data set

# Put the expression values on a log2 scale; adding 1 first keeps zero values
# finite (log2(0 + 1) = 0).
exprs(brca) <- (exprs(brca) + 1) %>%
  log2()

# Show the top-left 5 x 5 corner of the transformed matrix.
exprs(brca)[seq_len(5), seq_len(5)]
##          TCGA-A8-A085-01A-11R-A00Z-07 TCGA-A2-A0SY-01A-31R-A084-07
## TSPAN6                      13.975579                    10.981567
## TNMD                         1.584963                     6.189825
## DPM1                        11.156715                    10.822571
## SCYL3                       10.590587                    10.946906
## C1orf112                     9.519636                     9.339850
##          TCGA-AR-A24Z-01A-11R-A169-07 TCGA-D8-A1XU-01A-11R-A14M-07
## TSPAN6                      12.302353                    12.463013
## TNMD                         4.459432                     2.807355
## DPM1                        11.945444                    12.266494
## SCYL3                       10.611025                    11.149747
## C1orf112                     9.388017                     9.400879
##          TCGA-A1-A0SN-01A-11R-A144-07
## TSPAN6                       9.714246
## TNMD                         1.000000
## DPM1                        12.419960
## SCYL3                       11.136350
## C1orf112                     9.884171

PCA

Start by ranking genes based on their variation across samples.

# Variance of each gene (matrix row) across all samples, ordered from the most
# to the least variable gene.
row.var <- exprs(brca) %>%
  apply(1, var) %>%
  sort(decreasing = TRUE)
head(row.var)
##   CLEC3A  SCGB2A2     CPB1     TFF1  SCGB1D2    KCNJ3 
## 29.73892 25.49291 24.59669 21.00591 20.25785 19.56774

To save time, we’ll run PCA on the top 2500 most variable genes.

# Keep the 2500 highest-variance genes and transpose so that samples are rows
# and genes are columns before handing the data to prcomp().
top.genes <- names(row.var)[seq_len(2500)]
df <- data.frame(t(exprs(brca[top.genes])))

# Run the PCA and keep its summary.
pca <- prcomp(df)
pca.summary <- summary(pca)

# Importance table restricted to the first five principal components.
pca.summary$importance[, seq_len(5)]
##                             PC1      PC2      PC3      PC4      PC5
## Standard deviation     47.42767 40.65232 29.32491 23.58570 20.28785
## Proportion of Variance  0.14934  0.10972  0.05709  0.03693  0.02733
## Cumulative Proportion   0.14934  0.25906  0.31615  0.35309  0.38041

2-D Plot

# Attach the subtype labels to the PCA input so autoplot() can colour the
# samples by them.
df[["tumor_subtype"]] <- pData(brca)$tumor_subtype
autoplot(pca, data = df, colour = "tumor_subtype")

3-D Plot

# Assemble the first three PC scores together with the subtype labels.
# The data frame is built column-wise so the scores stay numeric: cbind()-ing
# the numeric score matrix with the character subtype vector would coerce the
# whole matrix to character, and the PC axes would no longer be continuous.
# Row names (sample IDs) are inherited from the score matrix.
df.pca <- data.frame(pca$x[, c("PC1", "PC2", "PC3")],
                     tumor_subtype = brca$tumor_subtype)

head(df.pca)
##                                            PC1               PC2
## TCGA-A8-A085-01A-11R-A00Z-07 -89.9895594300379  14.4728151158205
## TCGA-A2-A0SY-01A-31R-A084-07  8.62013654000677 -39.5325958432916
## TCGA-AR-A24Z-01A-11R-A169-07 -41.8816734638114 -26.6304385642973
## TCGA-D8-A1XU-01A-11R-A14M-07 -22.8004029255938 -38.3033437021283
## TCGA-A1-A0SN-01A-11R-A144-07 -26.3108798601652  1.88579216258153
## TCGA-D8-A73W-01A-22R-A352-07 -46.7262990133077  6.82995550570086
##                                            PC3 tumor_subtype
## TCGA-A8-A085-01A-11R-A00Z-07 -38.4190093513565          LumB
## TCGA-A2-A0SY-01A-31R-A084-07  7.54869728087252          LumA
## TCGA-AR-A24Z-01A-11R-A169-07  11.3252895436753          LumB
## TCGA-D8-A1XU-01A-11R-A14M-07  24.3111755582573          LumA
## TCGA-A1-A0SN-01A-11R-A144-07    21.69949616053          Her2
## TCGA-D8-A73W-01A-22R-A352-07 -12.9151476451899          LumB
# Interactive 3-D scatter of the first three principal components, one small
# marker per sample, coloured by tumor subtype.
p <- df.pca %>%
  plot_ly(
    x = ~PC1,
    y = ~PC2,
    z = ~PC3,
    type = "scatter3d",
    mode = "markers",
    color = ~tumor_subtype,
    marker = list(size = 3)
  )

# Display the widget.
p

Heatmaps

Basic Heatmap with annotations

First, let’s draw a heatmap for the top 100 most variable genes. This can be easily done with pheatmap. You will need to:

* Hide the row and column names (genes and samples respectively) as there are too many
* Add column annotations as color bars for the Tumor Subtype and the Sample Type
* Make sure to scale by each gene so the colors make sense

Hint: it is always advised to use red for high values and blue for low values in heatmaps.

# Subset the ExpressionSet to the 100 most variable genes.
dat <- brca[names(row.var)[seq_len(100)]]

# Per-sample annotation bars. pheatmap matches annotation rows to heatmap
# columns by name, so the frame is keyed on the sample IDs.
# The sample_type column is labelled "Sample_Type" (it was previously shown
# under the misleading legend title "Sample_Subtype").
sample.anno <- data.frame(Tumor_Subtype = pData(dat)$tumor_subtype,
                          Sample_Type = pData(dat)$sample_type,
                          row.names = row.names(pData(dat)))

pheatmap(exprs(dat),
         # "RdBu" runs red -> blue; reversing it maps low values to blue and
         # high values to red.
         color = rev(brewer.pal(11, "RdBu")),
         annotation_col = sample.anno,
         # Too many genes and samples to label individually.
         show_rownames = FALSE,
         show_colnames = FALSE,
         clustering_distance_rows = "euclidean",
         clustering_distance_cols = "euclidean",
         # Scale within each gene (row) so colours are comparable across genes.
         scale = "row"
)

Heatmap with marginal plots

Next, with the same set of genes, let’s add some marginal plots. You will need to:

* Plot PC 1 and 2 as line charts on the top of the heatmap to see if there is any association of PCs to tumor subtypes
* Plot the mean expression of each gene as bar charts on the right of the plot
* Split the heatmap into a reasonable number of clusters for rows and columns (using k-means partitioning) to see if each cluster contains certain tumor subtypes

The package we will be using is called ComplexHeatmap. While other packages like superheat or pheatmap could serve similar purposes, ComplexHeatmap, as the name suggests, provides the most comprehensive functionality for drawing a complicated heatmap with tons of information. Once you are familiar with it, it will be relatively easy to learn the other packages.

# The row/column splits below use k-means, which starts from random centres.
# Fixing the seed makes the partitioning (and, presumably, any annotation
# colours ComplexHeatmap picks at random when `col` is not given) identical on
# every run.
set.seed(123)

# Column (sample) annotations: colour bars for tumor subtype and sample type,
# plus the PC1 and PC2 scores from `pca` drawn as line tracks.
ca <- HeatmapAnnotation(Tumor_Subtype = pData(dat)$tumor_subtype,
                        Sample_Type = pData(dat)$sample_type,
                        PC1 = anno_lines(pca$x[, 1]),
                        PC2 = anno_lines(pca$x[, 2]),
                        annotation_name_side = "left")

# Row (gene) annotation: mean expression of each gene as a bar plot.
ra <- rowAnnotation(Mean = anno_barplot(rowMeans(exprs(dat))))

Heatmap(exprs(dat),
        show_row_names = FALSE,
        show_column_names = FALSE,
        column_title = "TCGA BRCA Samples",
        row_title = "Top 100 Variable Genes",
        top_annotation = ca,
        right_annotation = ra,
        # k-means partitioning: 5 column (sample) groups, 3 row (gene) groups.
        column_km = 5,
        row_km = 3)

Data Wrangling Questions

Example Dataset

# Gene symbols used to build the example dataset.
genes <- c("FOXA1","MLPH","AR","GATA3","DNALI1","FAM47E","RHOB","SPDEF",
"SLC7A8","TTC6","CA12","SMIM14","C5AR2","SIDT1","NOSTRIN","CCDC125",
"FAM198B-AS1","TBC1D9","SLC44A4","DYNLRB2","THSD4","FAM214A","GTF2IP7","SLC22A5",
"CCDC170")

# Fail early, with a readable condition, if any symbol is not a feature of the
# dataset (subsetting would otherwise stop with "subscript out of bounds").
stopifnot(all(genes %in% rownames(exprs(brca))))

# PCA restricted to these genes (samples as rows); keep the matrix of
# per-sample scores.
pcs <- brca[genes, ] %>%
       exprs() %>%
       t() %>%
       data.frame() %>%
       prcomp() %>%
       .[["x"]]

# Combine the PC1 score with the full phenotype table. drop = FALSE keeps PC1
# as a one-column matrix so the column is named "PC1" in the result.
df <- cbind(pcs[, "PC1", drop = FALSE], pData(brca))

head(df)
##                                    PC1                      full_id
## TCGA-A8-A085-01A-11R-A00Z-07 -7.550408 TCGA-A8-A085-01A-11R-A00Z-07
## TCGA-A2-A0SY-01A-31R-A084-07 -6.881499 TCGA-A2-A0SY-01A-31R-A084-07
## TCGA-AR-A24Z-01A-11R-A169-07 -9.659289 TCGA-AR-A24Z-01A-11R-A169-07
## TCGA-D8-A1XU-01A-11R-A14M-07 -7.164274 TCGA-D8-A1XU-01A-11R-A14M-07
## TCGA-A1-A0SN-01A-11R-A144-07 -3.429342 TCGA-A1-A0SN-01A-11R-A144-07
## TCGA-D8-A73W-01A-22R-A352-07 -5.565336 TCGA-D8-A73W-01A-22R-A352-07
##                                patient_id        sample_id
## TCGA-A8-A085-01A-11R-A00Z-07 TCGA-A8-A085 TCGA-A8-A085-01A
## TCGA-A2-A0SY-01A-31R-A084-07 TCGA-A2-A0SY TCGA-A2-A0SY-01A
## TCGA-AR-A24Z-01A-11R-A169-07 TCGA-AR-A24Z TCGA-AR-A24Z-01A
## TCGA-D8-A1XU-01A-11R-A14M-07 TCGA-D8-A1XU TCGA-D8-A1XU-01A
## TCGA-A1-A0SN-01A-11R-A144-07 TCGA-A1-A0SN TCGA-A1-A0SN-01A
## TCGA-D8-A73W-01A-22R-A352-07 TCGA-D8-A73W TCGA-D8-A73W-01A
##                                                           case_id
## TCGA-A8-A085-01A-11R-A00Z-07 3c08aabd-d5b5-4bbe-857c-38a7527b2163
## TCGA-A2-A0SY-01A-31R-A084-07 dc696e3c-f448-468f-a576-f4429be0338a
## TCGA-AR-A24Z-01A-11R-A169-07 9fefbe7c-f66a-4940-843e-285cb7b392c1
## TCGA-D8-A1XU-01A-11R-A14M-07 332148f5-f070-4c20-8eb1-4d8c0673aa52
## TCGA-A1-A0SN-01A-11R-A144-07 0dc337fa-da8b-42c4-b9a7-fb76d81c161f
## TCGA-D8-A73W-01A-22R-A352-07 ea8dbc7a-54c6-469c-865e-f49d00b0223d
##                              submitter_id project_id gender year_of_birth
## TCGA-A8-A085-01A-11R-A00Z-07 TCGA-A8-A085  TCGA-BRCA   male          1964
## TCGA-A2-A0SY-01A-31R-A084-07 TCGA-A2-A0SY  TCGA-BRCA female          1945
## TCGA-AR-A24Z-01A-11R-A169-07 TCGA-AR-A24Z  TCGA-BRCA female          1949
## TCGA-D8-A1XU-01A-11R-A14M-07 TCGA-D8-A1XU  TCGA-BRCA female          1954
## TCGA-A1-A0SN-01A-11R-A144-07 TCGA-A1-A0SN  TCGA-BRCA female          1957
## TCGA-D8-A73W-01A-22R-A352-07 TCGA-D8-A73W  TCGA-BRCA female          1934
##                                      race              ethnicity
## TCGA-A8-A085-01A-11R-A00Z-07 not reported           not reported
## TCGA-A2-A0SY-01A-31R-A084-07        white not hispanic or latino
## TCGA-AR-A24Z-01A-11R-A169-07        white           not reported
## TCGA-D8-A1XU-01A-11R-A14M-07        white not hispanic or latino
## TCGA-A1-A0SN-01A-11R-A144-07        white not hispanic or latino
## TCGA-D8-A73W-01A-22R-A352-07        white not hispanic or latino
##                              year_of_death classification_of_tumor
## TCGA-A8-A085-01A-11R-A00Z-07            --            not reported
## TCGA-A2-A0SY-01A-31R-A084-07            --            not reported
## TCGA-AR-A24Z-01A-11R-A169-07            --            not reported
## TCGA-D8-A1XU-01A-11R-A14M-07            --            not reported
## TCGA-A1-A0SN-01A-11R-A144-07            --            not reported
## TCGA-D8-A73W-01A-22R-A352-07            --            not reported
##                              last_known_disease_status
## TCGA-A8-A085-01A-11R-A00Z-07              not reported
## TCGA-A2-A0SY-01A-31R-A084-07              not reported
## TCGA-AR-A24Z-01A-11R-A169-07              not reported
## TCGA-D8-A1XU-01A-11R-A14M-07              not reported
## TCGA-A1-A0SN-01A-11R-A144-07              not reported
## TCGA-D8-A73W-01A-22R-A352-07              not reported
##                                             primary_diagnosis tumor_stage
## TCGA-A8-A085-01A-11R-A00Z-07 Infiltrating duct carcinoma, NOS   stage iib
## TCGA-A2-A0SY-01A-31R-A084-07           Lobular carcinoma, NOS  stage iiia
## TCGA-AR-A24Z-01A-11R-A169-07 Infiltrating duct carcinoma, NOS   stage iia
## TCGA-D8-A1XU-01A-11R-A14M-07 Infiltrating duct carcinoma, NOS    stage ia
## TCGA-A1-A0SN-01A-11R-A144-07 Infiltrating duct carcinoma, NOS   stage iia
## TCGA-D8-A73W-01A-22R-A352-07          Mucinous adenocarcinoma  stage iiia
##                              age_at_diagnosis vital_status morphology
## TCGA-A8-A085-01A-11R-A00Z-07            16377        alive     8500/3
## TCGA-A2-A0SY-01A-31R-A084-07            22928        alive     8520/3
## TCGA-AR-A24Z-01A-11R-A169-07            20900        alive     8500/3
## TCGA-D8-A1XU-01A-11R-A14M-07            20715        alive     8500/3
## TCGA-A1-A0SN-01A-11R-A144-07            18401        alive     8500/3
## TCGA-D8-A73W-01A-22R-A352-07            29125         dead     8480/3
##                              days_to_death
## TCGA-A8-A085-01A-11R-A00Z-07            --
## TCGA-A2-A0SY-01A-31R-A084-07            --
## TCGA-AR-A24Z-01A-11R-A169-07            --
## TCGA-D8-A1XU-01A-11R-A14M-07            --
## TCGA-A1-A0SN-01A-11R-A144-07            --
## TCGA-D8-A73W-01A-22R-A352-07         385.0
##                              days_to_last_known_disease_status
## TCGA-A8-A085-01A-11R-A00Z-07                                --
## TCGA-A2-A0SY-01A-31R-A084-07                                --
## TCGA-AR-A24Z-01A-11R-A169-07                                --
## TCGA-D8-A1XU-01A-11R-A14M-07                                --
## TCGA-A1-A0SN-01A-11R-A144-07                                --
## TCGA-D8-A73W-01A-22R-A352-07                                --
##                              days_to_recurrence  tumor_grade
## TCGA-A8-A085-01A-11R-A00Z-07                 -- not reported
## TCGA-A2-A0SY-01A-31R-A084-07                 -- not reported
## TCGA-AR-A24Z-01A-11R-A169-07                 -- not reported
## TCGA-D8-A1XU-01A-11R-A14M-07                 -- not reported
## TCGA-A1-A0SN-01A-11R-A144-07                 -- not reported
## TCGA-D8-A73W-01A-22R-A352-07                 -- not reported
##                              tissue_or_organ_of_origin days_to_birth
## TCGA-A8-A085-01A-11R-A00Z-07               Breast, NOS      -16377.0
## TCGA-A2-A0SY-01A-31R-A084-07               Breast, NOS      -22928.0
## TCGA-AR-A24Z-01A-11R-A169-07               Breast, NOS      -20900.0
## TCGA-D8-A1XU-01A-11R-A14M-07               Breast, NOS      -20715.0
## TCGA-A1-A0SN-01A-11R-A144-07               Breast, NOS      -18401.0
## TCGA-D8-A73W-01A-22R-A352-07               Breast, NOS      -29125.0
##                              progression_or_recurrence prior_malignancy
## TCGA-A8-A085-01A-11R-A00Z-07              not reported     not reported
## TCGA-A2-A0SY-01A-31R-A084-07              not reported     not reported
## TCGA-AR-A24Z-01A-11R-A169-07              not reported     not reported
## TCGA-D8-A1XU-01A-11R-A14M-07              not reported     not reported
## TCGA-A1-A0SN-01A-11R-A144-07              not reported     not reported
## TCGA-D8-A73W-01A-22R-A352-07              not reported     not reported
##                              site_of_resection_or_biopsy
## TCGA-A8-A085-01A-11R-A00Z-07                 Breast, NOS
## TCGA-A2-A0SY-01A-31R-A084-07                 Breast, NOS
## TCGA-AR-A24Z-01A-11R-A169-07                 Breast, NOS
## TCGA-D8-A1XU-01A-11R-A14M-07                 Breast, NOS
## TCGA-A1-A0SN-01A-11R-A144-07                 Breast, NOS
## TCGA-D8-A73W-01A-22R-A352-07                 Breast, NOS
##                              days_to_last_follow_up therapeutic_agents
## TCGA-A8-A085-01A-11R-A00Z-07                 1124.0                 --
## TCGA-A2-A0SY-01A-31R-A084-07                 1347.0                 --
## TCGA-AR-A24Z-01A-11R-A169-07                 3001.0                 --
## TCGA-D8-A1XU-01A-11R-A14M-07                  395.0                 --
## TCGA-A1-A0SN-01A-11R-A144-07                 1196.0                 --
## TCGA-D8-A73W-01A-22R-A352-07                  244.0                 --
##                              treatment_intent_type treatment_or_therapy
## TCGA-A8-A085-01A-11R-A00Z-07                    --                   --
## TCGA-A2-A0SY-01A-31R-A084-07                    --                   --
## TCGA-AR-A24Z-01A-11R-A169-07                    --                   --
## TCGA-D8-A1XU-01A-11R-A14M-07                    --                   --
## TCGA-A1-A0SN-01A-11R-A144-07                    --                   --
## TCGA-D8-A73W-01A-22R-A352-07                    --                   --
##                              sample_submitter_id case_submitter_id
## TCGA-A8-A085-01A-11R-A00Z-07    TCGA-A8-A085-01A      TCGA-A8-A085
## TCGA-A2-A0SY-01A-31R-A084-07    TCGA-A2-A0SY-01A      TCGA-A2-A0SY
## TCGA-AR-A24Z-01A-11R-A169-07    TCGA-AR-A24Z-01A      TCGA-AR-A24Z
## TCGA-D8-A1XU-01A-11R-A14M-07    TCGA-D8-A1XU-01A      TCGA-D8-A1XU
## TCGA-A1-A0SN-01A-11R-A144-07    TCGA-A1-A0SN-01A      TCGA-A1-A0SN
## TCGA-D8-A73W-01A-22R-A352-07    TCGA-D8-A73W-01A      TCGA-D8-A73W
##                              sample_type_id
## TCGA-A8-A085-01A-11R-A00Z-07              1
## TCGA-A2-A0SY-01A-31R-A084-07              1
## TCGA-AR-A24Z-01A-11R-A169-07              1
## TCGA-D8-A1XU-01A-11R-A14M-07              1
## TCGA-A1-A0SN-01A-11R-A144-07              1
## TCGA-D8-A73W-01A-22R-A352-07              1
##                              time_between_excision_and_freezing
## TCGA-A8-A085-01A-11R-A00Z-07                                 --
## TCGA-A2-A0SY-01A-31R-A084-07                                 --
## TCGA-AR-A24Z-01A-11R-A169-07                                 --
## TCGA-D8-A1XU-01A-11R-A14M-07                                 --
## TCGA-A1-A0SN-01A-11R-A144-07                                 --
## TCGA-D8-A73W-01A-22R-A352-07                                 --
##                              oct_embedded tumor_code_id
## TCGA-A8-A085-01A-11R-A00Z-07        false            --
## TCGA-A2-A0SY-01A-31R-A084-07         true            --
## TCGA-AR-A24Z-01A-11R-A169-07         true            --
## TCGA-D8-A1XU-01A-11R-A14M-07        false            --
## TCGA-A1-A0SN-01A-11R-A144-07         true            --
## TCGA-D8-A73W-01A-22R-A352-07        false            --
##                              intermediate_dimension is_ffpe
## TCGA-A8-A085-01A-11R-A00Z-07                     --   False
## TCGA-A2-A0SY-01A-31R-A084-07                     --   False
## TCGA-AR-A24Z-01A-11R-A169-07                     --   False
## TCGA-D8-A1XU-01A-11R-A14M-07                     --   False
## TCGA-A1-A0SN-01A-11R-A144-07                     --   False
## TCGA-D8-A73W-01A-22R-A352-07                     --   False
##                                             pathology_report_uuid
## TCGA-A8-A085-01A-11R-A00Z-07 64F84FF4-A477-4E1E-B4BB-E5614517229E
## TCGA-A2-A0SY-01A-31R-A084-07 8E6902A6-A673-46CC-9AEB-3A71EF11099F
## TCGA-AR-A24Z-01A-11R-A169-07 AD07F611-0EEA-4890-A02C-6DA3F5F57C45
## TCGA-D8-A1XU-01A-11R-A14M-07 845F8FCF-CF3C-4CEF-B673-A57DE626939C
## TCGA-A1-A0SN-01A-11R-A144-07 D0269758-EFAE-4EBA-8CCF-4A6CF4D4B35A
## TCGA-D8-A73W-01A-22R-A352-07 359DB5F2-BD23-42E1-B316-9D908DBACD78
##                              tumor_descriptor   sample_type
## TCGA-A8-A085-01A-11R-A00Z-07               -- Primary Tumor
## TCGA-A2-A0SY-01A-31R-A084-07               -- Primary Tumor
## TCGA-AR-A24Z-01A-11R-A169-07               -- Primary Tumor
## TCGA-D8-A1XU-01A-11R-A14M-07               -- Primary Tumor
## TCGA-A1-A0SN-01A-11R-A144-07               -- Primary Tumor
## TCGA-D8-A73W-01A-22R-A352-07               -- Primary Tumor
##                              distance_normal_to_tumor
## TCGA-A8-A085-01A-11R-A00Z-07                 released
## TCGA-A2-A0SY-01A-31R-A084-07                 released
## TCGA-AR-A24Z-01A-11R-A169-07                 released
## TCGA-D8-A1XU-01A-11R-A14M-07                 released
## TCGA-A1-A0SN-01A-11R-A144-07                 released
## TCGA-D8-A73W-01A-22R-A352-07                 released
##                              biospecimen_anatomic_site state
## TCGA-A8-A085-01A-11R-A00Z-07                        --    --
## TCGA-A2-A0SY-01A-31R-A084-07                        --    --
## TCGA-AR-A24Z-01A-11R-A169-07                        --    --
## TCGA-D8-A1XU-01A-11R-A14M-07                        --    --
## TCGA-A1-A0SN-01A-11R-A144-07                        --    --
## TCGA-D8-A73W-01A-22R-A352-07                        --    --
##                              diagnosis_pathologically_confirmed
## TCGA-A8-A085-01A-11R-A00Z-07                                 --
## TCGA-A2-A0SY-01A-31R-A084-07                                 --
## TCGA-AR-A24Z-01A-11R-A169-07                                 --
## TCGA-D8-A1XU-01A-11R-A14M-07                                 --
## TCGA-A1-A0SN-01A-11R-A144-07                                 --
## TCGA-D8-A73W-01A-22R-A352-07                                 --
##                              current_weight composition
## TCGA-A8-A085-01A-11R-A00Z-07             --          --
## TCGA-A2-A0SY-01A-31R-A084-07             --          --
## TCGA-AR-A24Z-01A-11R-A169-07             --          --
## TCGA-D8-A1XU-01A-11R-A14M-07             --          --
## TCGA-A1-A0SN-01A-11R-A144-07             --          --
## TCGA-D8-A73W-01A-22R-A352-07             --          --
##                              time_between_clamping_and_freezing
## TCGA-A8-A085-01A-11R-A00Z-07                                 --
## TCGA-A2-A0SY-01A-31R-A084-07                                 --
## TCGA-AR-A24Z-01A-11R-A169-07                                 --
## TCGA-D8-A1XU-01A-11R-A14M-07                                 --
## TCGA-A1-A0SN-01A-11R-A144-07                                 --
## TCGA-D8-A73W-01A-22R-A352-07                                 --
##                              distributor_reference shortest_dimension
## TCGA-A8-A085-01A-11R-A00Z-07                    --                 --
## TCGA-A2-A0SY-01A-31R-A084-07                    --                 --
## TCGA-AR-A24Z-01A-11R-A169-07                    --                 --
## TCGA-D8-A1XU-01A-11R-A14M-07                    --                 --
## TCGA-A1-A0SN-01A-11R-A144-07                    --                 --
## TCGA-D8-A73W-01A-22R-A352-07                    --                 --
##                              method_of_sample_procurement tumor_code
## TCGA-A8-A085-01A-11R-A00Z-07                           --        788
## TCGA-A2-A0SY-01A-31R-A084-07                           --       1083
## TCGA-AR-A24Z-01A-11R-A169-07                           --       1673
## TCGA-D8-A1XU-01A-11R-A14M-07                           --        102
## TCGA-A1-A0SN-01A-11R-A144-07                           --       1091
## TCGA-D8-A73W-01A-22R-A352-07                           --        191
##                              passage_count tissue_type
## TCGA-A8-A085-01A-11R-A00Z-07         130.0          --
## TCGA-A2-A0SY-01A-31R-A084-07         510.0          --
## TCGA-AR-A24Z-01A-11R-A169-07         120.0          --
## TCGA-D8-A1XU-01A-11R-A14M-07         210.0          --
## TCGA-A1-A0SN-01A-11R-A144-07         120.0          --
## TCGA-D8-A73W-01A-22R-A352-07         230.0          --
##                              biospecimen_laterality
## TCGA-A8-A085-01A-11R-A00Z-07                     --
## TCGA-A2-A0SY-01A-31R-A084-07                     --
## TCGA-AR-A24Z-01A-11R-A169-07                     --
## TCGA-D8-A1XU-01A-11R-A14M-07                     --
## TCGA-A1-A0SN-01A-11R-A144-07                     --
## TCGA-D8-A73W-01A-22R-A352-07                     --
##                              days_to_sample_procurement freezing_method
## TCGA-A8-A085-01A-11R-A00Z-07                         --              --
## TCGA-A2-A0SY-01A-31R-A084-07                         --              --
## TCGA-AR-A24Z-01A-11R-A169-07                         --              --
## TCGA-D8-A1XU-01A-11R-A14M-07                         --              --
## TCGA-A1-A0SN-01A-11R-A144-07                         --              --
## TCGA-D8-A73W-01A-22R-A352-07                         --              --
##                              preservation_method growth_rate
## TCGA-A8-A085-01A-11R-A00Z-07                  --          --
## TCGA-A2-A0SY-01A-31R-A084-07                  --          --
## TCGA-AR-A24Z-01A-11R-A169-07                  --          --
## TCGA-D8-A1XU-01A-11R-A14M-07                  --          --
## TCGA-A1-A0SN-01A-11R-A144-07                  --          --
## TCGA-D8-A73W-01A-22R-A352-07                  --          --
##                              days_to_collection catalog_reference
## TCGA-A8-A085-01A-11R-A00Z-07                 --                --
## TCGA-A2-A0SY-01A-31R-A084-07                 --                --
## TCGA-AR-A24Z-01A-11R-A169-07                 --                --
## TCGA-D8-A1XU-01A-11R-A14M-07                 --                --
## TCGA-A1-A0SN-01A-11R-A144-07                 --                --
## TCGA-D8-A73W-01A-22R-A352-07                 --                --
##                              initial_weight longest_dimension
## TCGA-A8-A085-01A-11R-A00Z-07             --                --
## TCGA-A2-A0SY-01A-31R-A084-07             --                --
## TCGA-AR-A24Z-01A-11R-A169-07             --                --
## TCGA-D8-A1XU-01A-11R-A14M-07             --                --
## TCGA-A1-A0SN-01A-11R-A144-07             --                --
## TCGA-D8-A73W-01A-22R-A352-07             --                --
##                              tumor_subtype
## TCGA-A8-A085-01A-11R-A00Z-07          LumB
## TCGA-A2-A0SY-01A-31R-A084-07          LumA
## TCGA-AR-A24Z-01A-11R-A169-07          LumB
## TCGA-D8-A1XU-01A-11R-A14M-07          LumA
## TCGA-A1-A0SN-01A-11R-A144-07          Her2
## TCGA-D8-A73W-01A-22R-A352-07          LumB

Example

# Mean PC1 score per tumor subtype; samples without a subtype label are
# dropped before grouping.
ex1 <- df %>%
       filter(!is.na(tumor_subtype)) %>%
       group_by(tumor_subtype) %>%
       summarize(PC1_mean = mean(PC1, na.rm = TRUE))

print(ex1)
## # A tibble: 5 x 2
##   tumor_subtype PC1_mean
##   <chr>            <dbl>
## 1 Basal           16.8  
## 2 Her2            -0.413
## 3 LumA            -4.57 
## 4 LumB            -4.23 
## 5 Normal          10.4
# One large point per tumor subtype showing its mean PC1 score; both the shape
# and the colour of the point encode the subtype.
ggplot(ex1, aes(tumor_subtype, PC1_mean)) +
  geom_point(aes(color = tumor_subtype, shape = tumor_subtype), size = 10) +
  labs(x = "Tumor Subtype",
       y = "PC1  ~ Basal Genes",
       title = "Important Genes",
       subtitle = "Average PC1 Across Tumor Subtypes",
       caption = "Figure 1: Example of data wrangling into ggplot")

Q1

Make a boxplot of PC1 across tumor subtypes.

Q2

Make a scatter plot of PC1 across tumor subtypes.

Q3

Make a boxplot of PC1 across tumor stages within each gender.

Q4

Make a density plot of PC1 across tumor stages within females.

Q5

Make a correlation plot of PC1 with age.